## [1] "MMSD-P2"  "MMSD-P7"  "MMSD-P8"  "MMSD-P11" "MMSD-P18" "Madison"

1 Data: The first look

The two data sets used in this analysis are the Madison case data sourced from the Wisconsin DHS and wastewater concentration data produced by the Wisconsin State Laboratory of Hygiene. This wastewater data has entries every couple of days from 25 January 2021 to 04 January 2022.

Date Site FirstConfirmed SevenDayMACases N1 BCoV N2 PMMoV GeoMeanN12 FlowRate temperature equiv_sewage_amt
2021-01-25 MMSD-P11 8 10.571429 280459 8.99 354015 31434596 315097.91 9.31 NA 1
2021-01-25 MMSD-P18 17 8.571429 310278 10.37 447153 29445680 372480.52 11.82 NA 1
2021-01-25 MMSD-P2 13 9.571429 62166 11.38 94675 35099462 76717.44 5.74 NA 1
2021-01-25 MMSD-P7 5 10.000000 64345 10.95 107125 22900169 83023.84 4.35 NA 1
2021-01-25 MMSD-P8 4 12.000000 66705 11.63 110510 22716404 85857.85 6.06 NA 1
2021-01-28 MMSD-P11 19 24.857143 1107390 1.98 1098130 24862163 1102750.28 9.07 NA 1

The case data has a strong weekend effect so for this section we look at a seven day smoothing of cases. The simple display of the data shows the core components of this story. First, wastewater data is noisy. And that there is a clear relationship between the two signals.

FirstImpresionFunc <- function(DF){
  FirstImpressionDF <- DF%>%
    NoNa(SelectedIndVar,SelectedDepVar)#Removing NA
  FirstImpressionGGplot = FirstImpressionDF%>%
  ggplot(aes(x=Date))+#Data depends on time
  geom_point(aes(y=!!sym(SelectedDepVar), color=SelectedDepVar,info=!!sym(SelectedDepVar)),size = 1)+
  geom_line(aes(y=MinMaxFixing(!!sym(SelectedIndVar),!!sym(SelectedDepVar)), 
                color=SelectedIndVar,
                info=!!sym(SelectedIndVar)))+#compares SelectedIndVar to Cases
  geom_line(aes(y=(SevenDayMACases), 
                color="Seven Day MA Cases",
                info=SevenDayMACases))+
  labs(y="Reported cases")+
    facet_wrap(~Site)
  return(ggplotly(FirstImpressionGGplot))
}
FirstImpresionFunc(filter(FullDF,Site=="Madison"))
FirstImpresionFunc(filter(FullDF,Site=="MMSD-P2"))
FirstImpresionFunc(filter(FullDF,Site=="MMSD-P7"))
FirstImpresionFunc(filter(FullDF,Site=="MMSD-P8"))
FirstImpresionFunc(filter(FullDF,Site=="MMSD-P11"))
FirstImpresionFunc(filter(FullDF,Site=="MMSD-P18"))

2 Removing potential outliers

Looking at the wastewater measurements we observe there were some points many times larger than adjacent values hinting at them being outliers. We used the adjacent 10 values on each side and marked points 2.5 standard deviations away from the group mean as outliers.

OutlierRemoveFunc <- function(DF){
  ErrorMarkedDF <- DF%>%#
    mutate(FlagedOutliers = IdentifyOutliers(!!sym(SelectedIndVar),Bin = 21, Action = "Flag"),
           #Manual flagging that method misses due to boundary effect with binning
           FlagedOutliers = ifelse(Date == mdy("01/26/2021") | Date == mdy("01/27/2021"),
                                   TRUE, FlagedOutliers),
           NoOutlierVar = ifelse(FlagedOutliers, NA, !!sym(SelectedIndVar)))
  return(ErrorMarkedDF)
}
MadDF <- OutlierRemoveFunc(filter(FullDF,Site=="Madison"))
P2DF <- OutlierRemoveFunc(filter(FullDF,Site=="MMSD-P2"))
P7DF <- OutlierRemoveFunc(filter(FullDF,Site=="MMSD-P7"))
P8DF <- OutlierRemoveFunc(filter(FullDF,Site=="MMSD-P8"))
P11DF <- OutlierRemoveFunc(filter(FullDF,Site=="MMSD-P11"))
P18DF <- OutlierRemoveFunc(filter(FullDF,Site=="MMSD-P18"))

OutlierPlotFunc <- function(DF){
  #Split N1 into outlier and non outlier for next ggplot
OutLierPlotDF <- DF%>%
  mutate(!!OutlierName := ifelse(FlagedOutliers,!!sym(SelectedIndVar),NA))%>%
           mutate(!!SelectedIndVar := NoOutlierVar)

OutLierPlotObject <- OutLierPlotDF%>%
  filter(!(is.na(!!sym(SelectedIndVar))&is.na(!!sym(OutlierName))))%>%
  ggplot(aes(x=Date))+#Data depends on time 
  geom_line(aes(y=!!sym(SelectedIndVar),
                color=SelectedIndVar, 
                info = !!sym(SelectedIndVar)))+#compares Var to Cases
  geom_point(aes(y=!!sym(OutlierName),
                 color=OutlierName,
                 info = !!sym(OutlierName)))+
  ColorRule+
    facet_wrap(~Site)

#mentioned hand picked list other choices
ReturnPlot <- ggplotly(OutLierPlotObject,tooltip=c("info","Date"))%>%
    layout(yaxis = SecondAxisFormat,
           legend=list(title=list(text=''),x = 1.15, y = 0.9),
           margin = list(l = 50, r = 75, b = 50, t = 50, pad = 4))
return(ReturnPlot)
}
OutlierPlotFunc(MadDF)
OutlierPlotFunc(P2DF)
OutlierPlotFunc(P7DF)
OutlierPlotFunc(P8DF)
OutlierPlotFunc(P11DF)
OutlierPlotFunc(P18DF)
PrepOutlierFunc <- function(DF){
  #Drop Var create Var filter 
UpdatedDF <- DF%>%
  select(-sym(SelectedIndVar))%>%
  rename(!!sym(SelectedIndVar) := NoOutlierVar)
return(UpdatedDF)
}

#MadDF <- PrepOutlierFunc(MadDF)
#P2DF <- PrepOutlierFunc(P2DF)
#P7DF <- PrepOutlierFunc(P7DF)
#P8DF <- PrepOutlierFunc(P8DF)
#P11DF <- PrepOutlierFunc(P11DF)
#P18DF <- PrepOutlierFunc(P18DF)


#FullDF

DetectedMain <- MadDF%>%
  filter(FlagedOutliers)%>%
  select(Date,FlagedOutliers)

StevePlot <- FullDF%>%
  ggplot()+
  aes(x=Date)+
  geom_vline(aes(xintercept=Date),color="red",data=DetectedMain)+
  geom_point(aes(y=N1))+
  facet_wrap(~Site, scale = "free_y")

StevePlot

A <- MadDF%>%
  filter(FlagedOutliers)
B <- P2DF%>%
  filter(FlagedOutliers)
C <- P7DF%>%
  filter(FlagedOutliers)
D <- P8DF%>%
  filter(FlagedOutliers)
E <- P11DF%>%
  filter(FlagedOutliers)
fF <- P18DF%>%
  filter(FlagedOutliers)

AllErrors <- full_join(full_join(full_join(full_join(full_join(A,B),C),D),E),fF)

AllErrors
##          Date     Site FirstConfirmed SevenDayMACases      N1   BCoV      N2
## 1  2021-01-26  Madison             56       89.428571 1831813  6.930 1686654
## 2  2021-01-27  Madison            162       88.857143 1357329  2.059 1323010
## 3  2021-04-15  Madison             56       50.285714  571269  3.684  706418
## 4  2021-04-26  Madison             NA       39.333333  675748 16.079  546554
## 5  2021-05-02  Madison             26       37.333333  876165  4.710 1134521
## 6  2021-05-14  Madison             37       23.500000  324203  5.070  465104
## 7  2021-06-06  Madison             NA        8.000000  658991  9.570  647639
## 8  2021-06-08  Madison              8        8.000000 1459947  0.982 1464354
## 9  2021-07-26  Madison             24       43.333333  186460  3.589  143034
## 10 2021-07-28  Madison             62       48.428571  260334 27.638  207880
## 11 2021-10-17  Madison             58       56.428571  823463  1.686  693348
## 12 2021-11-26  Madison             55      104.600000 1705574  6.299 1758779
## 13 2021-01-26  MMSD-P2             22       16.428571      NA     NA      NA
## 14 2021-01-27  MMSD-P2             48       26.714286      NA     NA      NA
## 15 2021-01-26  MMSD-P7              3       18.428571      NA     NA      NA
## 16 2021-01-27  MMSD-P7             15       28.142857      NA     NA      NA
## 17 2021-01-26  MMSD-P8              7       23.428571      NA     NA      NA
## 18 2021-01-27  MMSD-P8             24       25.428571      NA     NA      NA
## 19 2021-01-26 MMSD-P11             13       10.000000      NA     NA      NA
## 20 2021-01-27 MMSD-P11             56       24.142857      NA     NA      NA
## 21 2021-01-26 MMSD-P18             10        9.142857      NA     NA      NA
## 22 2021-01-27 MMSD-P18             18       24.428571      NA     NA      NA
##       PMMoV GeoMeanN12 FlowRate temperature equiv_sewage_amt FlagedOutliers
## 1  29748189  1757735.7    37.74          NA            1.000           TRUE
## 2  30102158  1340059.6    36.17          NA            1.000           TRUE
## 3  23628196   635259.6    39.02          NA            1.000           TRUE
## 4  28619982   607727.5    36.04          NA            0.250           TRUE
## 5  42099107   997009.3    35.23          NA            0.250           TRUE
## 6  34292122   388314.5    36.12          NA            0.625           TRUE
## 7  32185831   653290.3    34.39          NA            1.000           TRUE
## 8  54673826  1462148.8    36.40          NA            1.000           TRUE
## 9  30230924   163309.9    34.93          NA            1.000           TRUE
## 10 37320637   232633.3    36.82          NA            1.000           TRUE
## 11 29427968   755610.0    35.38          NA            0.625           TRUE
## 12 26466528  1731972.2    32.48          NA            0.625           TRUE
## 13       NA         NA       NA          NA               NA           TRUE
## 14       NA         NA       NA          NA               NA           TRUE
## 15       NA         NA       NA          NA               NA           TRUE
## 16       NA         NA       NA          NA               NA           TRUE
## 17       NA         NA       NA          NA               NA           TRUE
## 18       NA         NA       NA          NA               NA           TRUE
## 19       NA         NA       NA          NA               NA           TRUE
## 20       NA         NA       NA          NA               NA           TRUE
## 21       NA         NA       NA          NA               NA           TRUE
## 22       NA         NA       NA          NA               NA           TRUE
##    NoOutlierVar
## 1            NA
## 2            NA
## 3            NA
## 4            NA
## 5            NA
## 6            NA
## 7            NA
## 8            NA
## 9            NA
## 10           NA
## 11           NA
## 12           NA
## 13           NA
## 14           NA
## 15           NA
## 16           NA
## 17           NA
## 18           NA
## 19           NA
## 20           NA
## 21           NA
## 22           NA
#On a given day we have 5 intercepters

OutlierPlotFunc(MadDF)
OutlierPlotFunc(P2DF)
OutlierPlotFunc(P7DF)
OutlierPlotFunc(P8DF)
OutlierPlotFunc(P11DF)
OutlierPlotFunc(P18DF)
AllErrors%>%
  ggplot()+
  aes(x=Date)+
  geom_point(aes(y=N1,color=Site),size=2)+
  geom_vline(xintercept = ymd("2021-04-26"))+
  geom_vline(xintercept = ymd("2021-11-18"))+
  facet_wrap(~Site, scale = "free_y")

AllErrors%>%
  filter(!is.na(N1))%>%
  group_by(Date)%>%
  summarize(n())
## # A tibble: 12 x 2
##    Date       `n()`
##    <date>     <int>
##  1 2021-01-26     1
##  2 2021-01-27     1
##  3 2021-04-15     1
##  4 2021-04-26     1
##  5 2021-05-02     1
##  6 2021-05-14     1
##  7 2021-06-06     1
##  8 2021-06-08     1
##  9 2021-07-26     1
## 10 2021-07-28     1
## 11 2021-10-17     1
## 12 2021-11-26     1
#"2021-01-26" 
#"2021-06-06"
#"2021-06-08"
#"2021-10-17"
#"2021-11-26"

"2021-04-26"
## [1] "2021-04-26"
"2021-11-18"
## [1] "2021-11-18"
#P2 error seen in madison: maybe?
#P7 error seen in madison: maybe?
#P8 error seen in madison: no
#P8 error seen in madison: no
#P18 error seen in madison: yes, sometimes
LoessPlotFunc <- function(DF,SpanConstant = .163){
  MainDF <- DF
  MainDF[[loessVar]] <- loessFit(y=(MainDF[[SelectedIndVar]]), 
                      x=MainDF$Date, #create loess fit of the data
                      span=SpanConstant, #span of .2 seems to give the best result, not rigorously chosen
                      iterations=2)$fitted#2 iterations remove some bad patterns
  
  MainDF <- MainDF%>%
    NoNa(loessVar,"SevenDayMACases")
  SLDLoessGraphic <- MainDF%>%
    ggplot(aes(x=Date))+
    geom_line(aes(y=!!sym(SelectedDepVar), color=SelectedDepVar , info = !!sym(SelectedDepVar)),alpha=.1)+
    geom_line(aes(y=MinMaxFixing(!!sym(SelectedIndVar),!!sym(SelectedDepVar)),
                color=SelectedIndVar,
                info = !!sym(SelectedIndVar)),
            alpha=.2)+
    geom_line(aes(y=SevenDayMACases,
                color="Seven Day MA Cases" , 
                info = SevenDayMACases))+
    geom_line(aes(y=MinMaxFixing(!!sym(loessVar),!!sym(SelectedDepVar),!!sym(SelectedIndVar)), 
                color=loessVar ,
                info = !!sym(loessVar)))+
    facet_wrap(~Site)+
    labs(y="Reported cases")


ReturnPlot <- ggplotly(SLDLoessGraphic,tooltip=c("info","Date"))%>%
    add_lines(x=~Date, y=MainDF[[SelectedIndVar]],
            yaxis="y2", data=MainDF, showlegend=FALSE, inherit=FALSE) %>%
    layout(yaxis2 = SecondAxisFormat,
           legend=list(title=list(text=''),x = 1.15, y = 0.9),
           margin = list(l = 50, r = 75, b = 50, t = 50, pad = 4))
return(ReturnPlot)
}


LoessPlotFunc(MadDF)
LoessPlotFunc(P2DF)
LoessPlotFunc(P7DF)
LoessPlotFunc(P8DF)
LoessPlotFunc(P11DF)
LoessPlotFunc(P18DF)